!pip install folium plotly
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import folium
import plotly.graph_objects as go
import plotly
from bs4 import BeautifulSoup
import requests
from IPython.display import Image, display
mapbox_access_token = 'pk.eyJ1Ijoic3VubnkyMzA5IiwiYSI6ImNrMnlvNGRoMzA5YW0zY29tMGVhaWNwa2YifQ.E6iZyoNa1Inbf93yh-kjhw'
pd.set_option('display.max_columns',120)
pd.set_option('display.max_rows',100)
%matplotlib inline
# Load the raw food-inspection export into a DataFrame.
# NOTE(review): the filename suggests a duplicate browser download ("(1)") --
# confirm this is the intended file.
food_inspection = pd.read_csv('Food_Inspection (1).csv')
#print('Columns : ',food_inspection.columns)
print('Food Inspection Dataframe Size : ',food_inspection.shape)
food_inspection.head()
# Distinct establishment categories present in the data (informs the
# restaurant-category list chosen below).
food_inspection[['Category']].drop_duplicates()
We create another column named Restaurant whose value is True if the entry is for a restaurant and False otherwise. The categories listed below are the ones we consider to be restaurant types.
# Categories treated as "restaurant-like" for the rest of the analysis.
list_of_restaurants = ['Fast Food', 'Restaurant', 'Cafeteria', 'Seafood', 'Buffet', 'Diner', 'Pizza']
#list_of_restaurants = ['Restaurant']
# Boolean flag: True when the establishment's category is restaurant-like.
food_inspection['Restaurant'] = food_inspection['Category'].isin(list_of_restaurants)
#food_inspection[food_inspection.Restaurant == True]['Establishment ID'].value_counts()
food_inspection.head()
# Convert the inspection date to a proper datetime and derive year/month parts.
print('Food Inspection Date Datatype Before', food_inspection['Inspection Date'].dtype)
food_inspection['Inspection Date'] = pd.to_datetime(food_inspection['Inspection Date'])
# Fixed copy-paste bug: this line previously printed "Before" a second time.
print('Food Inspection Date Datatype After', food_inspection['Inspection Date'].dtype)
# Vectorized .dt accessors instead of per-row Python loops.
food_inspection['Inspection_year'] = food_inspection['Inspection Date'].dt.year
food_inspection['Inspection_month'] = food_inspection['Inspection Date'].dt.month
food_inspection['Inspection_year_month'] = food_inspection['Inspection_month'].astype(str) + '_' + food_inspection['Inspection_year'].astype(str)
food_inspection[['Inspection Date', 'Inspection_year', 'Inspection_month', 'Inspection_year_month']]
# Columns recording the outcome of individual compliance checks.
compliance_type_columns = ['Food from Approved Source (C)', 'Food Protected from Contamination (C)', 'Ill Workers Restricted (C)', \
'Proper Hand Washing (C)', 'Cooling Time and Temperature (C)', 'Cold Holding Temperature (C)',\
'Hot Holding Temperature (C)', 'Cooking Time and Temperature (C)', 'Reheating Time and Temperature (C)',\
'Hot and Cold Running Water Provided (C)', 'Proper Sewage Disposal (C)', 'Toxic Substances & Pesticides', \
'Rodent and Insects', 'Nutritional Labeling', 'Trans Fat Ban', 'No-Smoking Sign Posted']
# Show the raw labels present in each compliance column.
for col in compliance_type_columns:
    print(col,' : ',food_inspection[col].unique())
# Encode each check: 1 = violation, 0 = compliant, NaN = any other label.
label_to_flag = {'Out of Compliance': 1, 'In Compliance': 0}
for col in compliance_type_columns:
    food_inspection[col] = food_inspection[col].map(label_to_flag)
food_inspection[compliance_type_columns]
# Total violations per inspection, ignoring NaNs (checks not performed).
food_inspection['Num_of_violations'] = np.nansum(food_inspection[compliance_type_columns].values, axis=1)
food_inspection['Num_of_violations'][:10]
# 1 when the inspection had at least one violation, else 0.
food_inspection['Out_of_compliance'] = (food_inspection['Num_of_violations'] > 0).astype(int)
food_inspection['Out_of_compliance'][:10]
# Restrict to restaurants that were inspected more than once, so that
# re-inspection behaviour can be studied.
inspection_counts = food_inspection.loc[food_inspection.Restaurant == True, 'Establishment ID'].value_counts()
multi_inspected_ids = inspection_counts[inspection_counts > 1].index
food_inspection_multi_insp = food_inspection[food_inspection['Establishment ID'].isin(multi_inspected_ids)].copy()
sub_food_inspection = food_inspection_multi_insp[['Establishment ID', 'Category', 'Inspection Date', 'Num_of_violations']]
sub_food_inspection.head()
# Build one row per establishment:
# [id, category, date_1, violations_1, date_2, violations_2, ...]
total_data = []
for est_id, grp in sub_food_inspection.groupby('Establishment ID'):
    row = [est_id, grp['Category'].values[0]]
    for insp_date, n_viol in zip(grp['Inspection Date'], grp['Num_of_violations']):
        row.append(insp_date)
        row.append(n_viol)
    total_data.append(row)
# Longest row determines how many date/violation column pairs we need.
max_cnt = max(len(row) for row in total_data)
print(max_cnt)
columns = ['Establishment ID', 'Category']
for i in range((max_cnt - 2) // 2):
    columns.extend(['Inspection_date_%d' % (i + 1), 'Inspection_%d' % (i + 1)])
# Shorter rows are right-padded with NaN by the DataFrame constructor.
est_to_datewise_insp_viols = pd.DataFrame(total_data, columns=columns)
est_to_datewise_insp_viols.head(10)
# Total number of violations per violation type (restaurants only),
# sorted with the most common type first.
violation_type_to_cnt = pd.DataFrame(food_inspection[food_inspection.Restaurant == True][compliance_type_columns].sum().sort_values(ascending=False)).rename(columns={0:'Num_of_violations'})
violation_type_to_cnt
Findings: From the table above we can see that Cold_holding_temperature is the most common type of violation.
# For establishments whose i-th inspection was clean (zero violations),
# tally the violation totals at each earlier inspection -- i.e. how much
# violation activity preceded eventually becoming compliant.
all_data = []
# NOTE(review): the exclusive upper bound range(2, (max_cnt-2)//2) skips the
# final inspection column -- confirm this is intentional, otherwise it looks
# like an off-by-one.
for i in range(2, (max_cnt-2)//2):
    # Establishments whose i-th inspection recorded zero violations.
    sub_df = est_to_datewise_insp_viols[est_to_datewise_insp_viols['Inspection_%d'%i] == 0]
    # Violation totals at inspection 1, then at inspections 2..i.
    row_data = [sub_df['Inspection_1'].sum(),]
    for j in range(2, i+1):
        row_data.append(sub_df['Inspection_%d'%j].sum())
    all_data.append(row_data)
# max_cnt is re-purposed here: now the longest row length within all_data.
max_cnt = max([len(row) for row in all_data])
index = ['%d Inspections'%(i+1) for i in range(len(all_data))]
columns = ['Never Compliant']
columns.extend(['Compliant After %d Re-Inspections'%(i+1) for i in range(max_cnt-1)])
no_violations_to_become_compliant = pd.DataFrame(all_data, columns = columns, index=index)
# Column totals (excluding the first row) plus a per-row Total column.
no_violations_to_become_compliant.loc['Total'] = no_violations_to_become_compliant[1:].sum()
no_violations_to_become_compliant['Total'] = no_violations_to_become_compliant.sum(axis=1)
no_violations_to_become_compliant[1:]
Findings: From the table above we can see that many establishments keep failing to stay compliant. It takes around 7 re-inspections until an establishment becomes compliant; very few remain non-compliant after 6-7 inspections.
# Bar chart: how often each violation type occurs across all inspections.
with plt.style.context(('seaborn', 'ggplot')):
    ax = violation_type_to_cnt.plot(kind='bar', width=0.8)
    ax.set_title('Number of Violations per Violation Type')
    ax.set_xlabel('Violation Type')
    ax.set_ylabel('Number Of Violations')
# Share (%) of restaurant inspections with at least one violation, by month.
# Select the column before .mean() so only the needed numeric column is
# averaged (averaging the whole frame is deprecated/failing on modern pandas).
percent_violation_per_month = food_inspection[food_inspection.Restaurant == True].groupby(by='Inspection_month')[['Out_of_compliance']].mean() * 100
percent_violation_per_month
with plt.style.context(('seaborn', 'ggplot')):
    percent_violation_per_month.plot()
    plt.ylabel('% of Violations')
    # Ticks must sit at the month values 1..12 forming the x-axis index.
    # The previous xticks(range(12), range(1, 13)) placed ticks at 0..11,
    # shifting every label one month to the left of its data point.
    plt.xticks(range(1, 13))
    plt.title('% of Violations Per Month')
Findings: We can see from the graph above that restaurants have fewer violations at the start and end of the year. Violations keep going down until July, then start increasing again, peaking during September before dropping off again.
# Share (%) of restaurant inspections with at least one violation, by year.
# Column is selected before .mean() so only the needed numeric column is averaged.
percent_violation_per_year = food_inspection[food_inspection.Restaurant == True].groupby(by='Inspection_year')[['Out_of_compliance']].mean() * 100
percent_violation_per_year
with plt.style.context(('seaborn', 'ggplot')):
    percent_violation_per_year.plot()
    plt.ylabel('% of Violations')
    # One tick per year actually present in the data; this replaces the
    # hard-coded commented-out xticks and uses the previously-unused bounds.
    mini, maxi = int(percent_violation_per_year.index[0]), int(percent_violation_per_year.index[-1])+1
    plt.xticks(range(mini, maxi))
    plt.title('% of Violations Per Year')
Findings: From the graph above, we can see that restaurant violations are decreasing year over year, and in 2019 they are well below 40%.
# Number of failed (out-of-compliance) inspections per restaurant.
out_of_compliance_count_per_rest = food_inspection[food_inspection.Restaurant == True].groupby(by='Establishment ID').sum()[['Out_of_compliance']].rename(columns={'Out_of_compliance': 'Out_of_compliance_Cnt'})
# One Location string per establishment (first occurrence kept).
food_inspection_locations = food_inspection.drop_duplicates(subset=['Establishment ID']).set_index('Establishment ID')[['Location']]
non_compliance_count_per_rest_with_loc = out_of_compliance_count_per_rest.join(food_inspection_locations, how='left')[['Out_of_compliance_Cnt', 'Location']]
# Drop establishments with no recorded location (cannot be mapped).
non_compliance_count_per_rest_with_loc = non_compliance_count_per_rest_with_loc.dropna()
non_compliance_count_per_rest_with_loc.head()
# Distribution of failure counts across establishments.
print(Counter(non_compliance_count_per_rest_with_loc.Out_of_compliance_Cnt))
# Parse "(a, b)" Location strings into a float array of coordinate pairs.
locations = np.array([loc.replace('(','').replace(')','').split(',') for loc in non_compliance_count_per_rest_with_loc.Location]).astype(np.float32)
locations[:10]
# NOTE(review): column 0 is assigned to lng and column 1 to lat. If the
# Location strings follow the common "(lat, lng)" convention, these are
# swapped -- verify against the sample printed above before trusting the maps.
lng, lat = locations[:,0], locations[:,1]
non_compliance_count_per_rest_with_loc['lat'] = lat
non_compliance_count_per_rest_with_loc['lng'] = lng
non_compliance_count_per_rest_with_loc.head()
# Midpoint of the coordinate bounding box, used to centre the maps.
mid_lat = (non_compliance_count_per_rest_with_loc.lat.min() + non_compliance_count_per_rest_with_loc.lat.max())/2
mid_lng = (non_compliance_count_per_rest_with_loc.lng.min() + non_compliance_count_per_rest_with_loc.lng.max())/2
# Interactive folium map: one circle per establishment, sized by violation
# count and coloured red when it has at least one violation.
folium_map = folium.Map(location=(mid_lat, mid_lng),
                        zoom_start=12,
                        tiles="CartoDB dark_matter")
for e_id in non_compliance_count_per_rest_with_loc.index:
    # Hoist the row lookup: the original repeated .loc[e_id] five times
    # per iteration.
    row = non_compliance_count_per_rest_with_loc.loc[e_id]
    cnt = row['Out_of_compliance_Cnt']
    folium.CircleMarker(location=(row['lat'], row['lng']),
                        radius=float(cnt + 1),
                        color='red' if cnt > 0 else 'white',
                        popup=str(e_id) + ':' + str(cnt),
                        tooltip=str(e_id) + ', Violations : ' + str(cnt),
                        fill=True).add_to(folium_map)
folium_map.save('loc_wise_map.html')
# Plotly Mapbox version of the same map: one scatter trace per establishment.
mid_lat = (non_compliance_count_per_rest_with_loc.lat.min() + non_compliance_count_per_rest_with_loc.lat.max())/2
mid_lng = (non_compliance_count_per_rest_with_loc.lng.min() + non_compliance_count_per_rest_with_loc.lng.max())/2
fig = go.Figure()
for e_id in non_compliance_count_per_rest_with_loc.index:
    # Hoist the row lookup: the original repeated .loc[e_id] four times
    # per iteration.
    row = non_compliance_count_per_rest_with_loc.loc[e_id]
    cnt = row['Out_of_compliance_Cnt']
    fig.add_trace(go.Scattermapbox(
        lat=[row.lat],
        lon=[row.lng],
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=8 if cnt == 0 else 12,
            color='white' if cnt == 0 else 'red',
            opacity=0.7
        ),
        text=[str(e_id) + ', Violations : ' + str(cnt)],
    ))
fig.update_layout(
    title='Violation Count distribution',
    autosize=True,
    hovermode='closest',
    showlegend=False,
    mapbox=go.layout.Mapbox(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=go.layout.mapbox.Center(
            lat=mid_lat,
            lon=mid_lng
        ),
        pitch=0,
        zoom=9,
        style='dark'
    ),
)
fig.show()
# Per-city violation totals for every violation type (restaurants only),
# sorted lexicographically across the columns in the order given.
food_inspection[food_inspection.Restaurant == True].groupby('City').sum()[compliance_type_columns].sort_values(compliance_type_columns, ascending=False)
Findings: From the interactive visualization, we can conclude that the majority of violation counts per establishment fall around 5-7. There are a few areas with clusters of defaulting establishments (around 5-7 violations each), such as Berwyn Heights, College Park, Kolbes Corner, Ritchie, Hillcrest Heights, Upper Marlboro, Clinton, and a few more.
We also noticed that cities such as Riverdale, Laurel, Bowie, Upper Marlboro, College Park, Clinton, Hyattsville, Landover, Bladensburg, Fort Washington, West Hyattsville, and New Carrollton are among the cities where all violation types are quite common.
Food_from_approved_source, Ill_workers_restricted, Cooling_time_and_temperature, Cooking_time_and_temperature, Reheating_time_and_temperature, Hot_and_cold_running_water_provided, Proper_sewage_disposal, No_bare_hand_contact, and Adequate_hand_washing_facilities are the least frequently occurring violations.
Food_protected_from_contamination, Proper_hand_washing, Cold_holding_temperature, Hot_holding_temperature, Rodent_and_insects, and Food_contact_surfaces_and_equipment are quite commonly occurring violations.
This assignment builds off of a previous exercise from weeks ago, scraping data from https://pokemondb.net.
# Scrape the full national pokedex table; each <tr> is one pokemon (or form).
soup = BeautifulSoup(requests.get('https://pokemondb.net/pokedex/all').content, 'html.parser')
all_rows = soup.find_all('tr')
def create_data_frame(all_rows):
    """Build a DataFrame from the scraped <tr> rows of the pokedex table.

    Header cells (<th>) become column names; a synthetic 'URL' column is
    inserted right after 'Name' and filled with each pokemon's detail-page
    link taken from the name cell's hyperlink. Data cells (<td>) become
    row values. Rows with no <td> cells (the header row) are skipped.

    Parameters
    ----------
    all_rows : list of bs4.element.Tag
        Every <tr> found in the page, header row included.

    Returns
    -------
    pd.DataFrame
    """
    header_names = []
    rows_data = []
    for row in all_rows:
        single_row_data = []
        for header in row.find_all('th'):
            header_names.append(header.text)
            if header.text == 'Name':
                header_names.append('URL')
        for val in row.find_all('td'):
            single_row_data.append(val.text)
            # Guard against cells with no class attribute: get('class')
            # returns None there, and the original [0] raised TypeError.
            cell_classes = val.get('class') or []
            if cell_classes and cell_classes[0] == 'cell-name':
                hyperlink = val.find('a')
                single_row_data.append(hyperlink['href'])
        if single_row_data:
            rows_data.append(single_row_data)
    return pd.DataFrame(rows_data, columns=header_names)
final_df = create_data_frame(all_rows)
# Cast every numeric stat column in one pass instead of eight
# copy-pasted assignment lines.
numeric_columns = ['#', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
for col in numeric_columns:
    final_df[col] = final_df[col].astype('int')
# Normalize the Type string (types are space-separated within the cell).
final_df['Type'] = final_df['Type'].str.strip()
# Collect every type token that appears (a row can list two types,
# space-separated, e.g. "Grass Poison").
types = []
for val in final_df.Type:
    types.extend(val.strip().split(' '))
unique_types = list(set(types))
# One-hot encode the types. Token-level membership (split) avoids the
# substring false-positives that `typ in row` could produce, and fixes the
# missing space in the original `else 0for` expression.
types_data = []
for row in final_df.Type:
    row_tokens = row.split(' ')
    types_data.append([1 if typ in row_tokens else 0 for typ in unique_types])
types_data = np.array(types_data)
for i, typ in enumerate(unique_types):
    final_df[typ] = types_data[:, i]
# Multiple forms of one pokemon share a pokedex URL; keep the first only.
final_df = final_df.drop_duplicates(subset=['URL'])
final_df.head(10)
def _mark_every(divisor):
    # 'sample' for pokedex numbers divisible by divisor, else None.
    return final_df['#'].apply(lambda num: 'sample' if num % divisor == 0 else None)

# Two sampling tiers: every 4th pokedex number, and every 12th.
final_df['Sample'] = _mark_every(4)
final_df['Sample2'] = _mark_every(12)
final_df.head(15)
# Display the first image from each sampled pokemon's detail page.
for pokemon_name in final_df[final_df.Sample2 == 'sample'].Name.values:
    # Explicit parser keeps this consistent with the earlier soup construction
    # (and silences bs4's "no parser specified" warning).
    soup = BeautifulSoup(requests.get('https://pokemondb.net/pokedex/%s'%pokemon_name).content, 'html.parser')
    try:
        image_url = soup.find_all('img')[0]['src']
    except (IndexError, KeyError):
        # No <img> on the page, or the tag lacks a src -- e.g. the Name
        # value is not a valid URL slug. Narrowed from a bare except.
        image_url = None
    if image_url:
        display(Image(url=image_url, unconfined=True))
# Prototype on a single pokemon: pull all HTML tables from its detail page.
pokemon_name = 'bulbasaur'
url = 'https://pokemondb.net/pokedex/%s'%pokemon_name
# Browser-like User-agent to avoid being blocked; read_html parses all tables.
tables = pd.read_html(requests.get(url, headers={'User-agent': 'Mozilla/5.0'}).text)
# Second-to-last table is treated as the per-game location listing
# (game name in column 0, location text in column 1).
tables[-2]
# Games we want a location column for, roughly in release order.
loc_cols = ['Red','Blue', 'Yellow', 'Gold', 'Silver', 'Crystal', 'Ruby', 'Sapphire', 'Emerald', 'FireRed', 'LeafGreen',\
'Diamond', 'Pearl', 'Platinum', 'HeartGold', 'SoulSilver','Black', 'White', 'Black 2', 'White 2', 'X','Y',\
'Omega Ruby', 'Alpha Sapphire', "Let's Go Pikachu", "Let's Go Eevee"]
# Reshape: game names become column headers, locations become the single row.
game_dataframe = pd.DataFrame([tables[-2][1].values], columns=tables[-2][0].values)
game_dataframe
# Align the scraped game columns with the canonical loc_cols list:
# for each target game, take the value of the first scraped column whose
# header contains the game name, else None.
row_data = []
for col in loc_cols:
    col_data_found = False
    # Substring match copes with merged header cells (e.g. "Red Blue").
    # NOTE(review): 'Red' is also a substring of 'FireRed' -- correctness
    # relies on the page listing 'Red Blue' before 'FireRed'; verify.
    for g_col in game_dataframe.columns:
        if col in g_col:
            row_data.append(game_dataframe[g_col].values[0])
            col_data_found = True
            break
    if not col_data_found:
        # This game is not listed on the page.
        row_data.append(None)
game_dataframe_modified = pd.DataFrame([row_data], columns=loc_cols)
game_dataframe_modified
# Fetch the per-game location table for every pokemon in the 1-in-4 sample.
pokemons, game_dataframes = [], []
for pokemon_name in final_df[final_df.Sample == 'sample'].Name:
    url = 'https://pokemondb.net/pokedex/%s'%pokemon_name
    try:
        tables = pd.read_html(requests.get(url, headers={'User-agent': 'Mozilla/5.0'}, timeout=3).text)
        game_dataframe = pd.DataFrame([tables[-2][1].values], columns=tables[-2][0].values)
        game_dataframe.columns = [col.strip() for col in game_dataframe.columns]
    except Exception:
        # Network error, bad URL slug, or unexpected page layout: skip this
        # pokemon. Narrowed from a bare `except:`, which would also have
        # swallowed KeyboardInterrupt/SystemExit.
        continue
    if game_dataframe.shape[0] > 0:
        game_dataframes.append(game_dataframe)
        pokemons.append(pokemon_name)
print('Number of game dataframes %d'%len(game_dataframes))
# Apply the same game-column alignment to every fetched pokemon:
# one output row per pokemon, columns following loc_cols.
total_data = []
for pokemon, df in zip(pokemons, game_dataframes):
    row_data = [pokemon]
    for col in loc_cols:
        col_data_found = False
        # Substring match copes with merged header cells like "Red Blue".
        # NOTE(review): 'Red' also occurs in 'FireRed'; correctness relies
        # on the page's column order -- verify.
        for g_col in df.columns:
            if col in g_col:
                row_data.append(df[g_col].values[0])
                col_data_found = True
                break
        if not col_data_found:
            # This game is not listed for this pokemon.
            row_data.append(None)
    total_data.append(row_data)
locations_data = pd.DataFrame(total_data, columns = ['Name'] + loc_cols).set_index('Name')
locations_data
# Join the sampled pokemon stats with their scraped game-location data.
final_df_sample = final_df[final_df['Sample'] == 'sample'].set_index('Name')
print('Sample Dataset Size : ',final_df_sample.shape)
final_df_with_loc_data = final_df_sample.join(locations_data, how='left')
final_df_with_loc_data
# Average Total stat per Pokemon-X location. Selecting ['Total'] before
# .mean() avoids averaging (or failing on) the many non-numeric columns.
avg_total_point_per_location = final_df_with_loc_data.groupby(by='X')[['Total']].mean()
avg_total_point_per_location.head()
avg_total_point_per_location.sort_values(by='Total', ascending=False).head()
The location "Team Flare HQ" has the highest average Total, as we can see from the sorted values above.